1 Introduction

This file accompanies the Vizgen Analysis Guides: Cell Type Annotation.

Here we will look at the implementation of an open source package SingleR to assist in cell type annotation.

2 Loading packages

knitr::opts_chunk$set(echo = TRUE)
library(Seurat) # to run normalizaiton methods
# BiocManager::install('SingleR')
# install.packages('matrixStats')
# BiocManager::install('scran')
library(scran)
## Warning: package 'matrixStats' was built under R version 4.3.1
## Warning: package 'IRanges' was built under R version 4.3.1
## Warning: package 'GenomeInfoDb' was built under R version 4.3.1
library(SingleR)
library(data.table) # data processing library
library(ggplot2) # package to make visualizaitons
# BiocManager::install('scRNAseq')
library(scRNAseq) # package to access reference scRNA data

3 Loading Data

Here we are going to use publicaly available data from a MERSCOPE mouse brain sample.

For our purposes we only need cell_by_gene.csv and cell_metadata.csv files.

md <- fread('https://vzg-vzg-showcase-data.s3.amazonaws.com/Mouse_Brain_Showcase_232/Slice-1_Replicate-1/cell_metadata.csv',
               header = T, data.table = FALSE)

counts <- fread('https://vzg-vzg-showcase-data.s3.amazonaws.com/Mouse_Brain_Showcase_232/Slice-1_Replicate-1/cell_by_gene.csv',
               header = T, data.table = FALSE)

rownames(md) <- md$EntityID 
rownames(counts) <- counts$cell
counts <- counts[,-1] # removing cells as a column so we have a data frame of only counts
blanks <- counts[,grepl('Blank', colnames(counts))]
counts <- counts[,!grepl('Blank', colnames(counts))] # removing blanks

counts <- t(counts)

# counts[1:5,1:5]
# table(rownames(md) == colnames(counts)) # making sure the data frames align
ggplot(md, aes(x = center_x, y = -center_y)) + 
  geom_point(size = .1) + theme_bw()

3.1 Reference

Here we are going to use the Chen et al. 2017 mouse brain.

Below is a table of the cell type labels present in the data and the number of cells they have for each type.

ref <- ChenBrainData()
## see ?scRNAseq and browseVignettes('scRNAseq') for documentation
## loading from cache
## see ?scRNAseq and browseVignettes('scRNAseq') for documentation
## loading from cache
table(ref$SVM_clusterID, exclude = F)
## 
##   Astro  Ependy  Epith1  Epith2   GABA1  GABA10  GABA11  GABA12  GABA13  GABA14 
##    1148     413     818     379      23      23      35      62     165      71 
##  GABA15  GABA16  GABA17  GABA18   GABA2   GABA3   GABA4   GABA5   GABA6   GABA7 
##     112      66      50      31      27      97      18      73      49      17 
##   GABA8   GABA9    Glu1   Glu10   Glu11   Glu12   Glu13   Glu14   Glu15    Glu2 
##     402      71      13      15      42      24      24      51      25      21 
##    Glu3    Glu4    Glu5    Glu6    Glu7    Glu8    Glu9   Hista     IMO   Macro 
##      13     131     200      51     211      50      35      17     151     167 
##   Micro      MO     OPC    POPC     SCO    Tany zothers 
##     724    3541    1741      51      32     609    2348
# remotes::install_github("LTLA/scuttle")
library(scuttle)

ref <- logNormCounts(ref)

4 Creating Seurat Object

We are going to create a Seurat object out of our data.

spat <- CreateSeuratObject(counts = counts,
                           assay = 'RNA',
                           meta.data = md)
spat <- subset(spat, nCount_RNA >= 100)

spat <- NormalizeData(spat, normalization.method = 'LogNormalize',
                      scale.factor =  1e4)
spat <- ScaleData(spat, features = rownames(spat))

spat <- RunPCA(spat, features = rownames(spat), verbose = FALSE)

spat <- FindNeighbors(spat, dims = 1:10)

spat <- FindClusters(spat, resolution = .5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 67729
## Number of edges: 2018918
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9224
## Number of communities: 18
## Elapsed time: 23 seconds
spat <- RunUMAP(spat, dims = 1:10)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session

5 Running SingleR

# We need the Seurat object as a single cell expeirment for SingleR
test <- as.SingleCellExperiment(spat)

# Running SingleR
pred <- SingleR(test = test, ref = ref, labels = ref$SVM_clusterID,
                de.method = 'wilcox')

spat$labels_fine <- pred$labels
spat$delta <- pred$delta.next
spat$labels_prune <- pred$pruned.labels

Below is a table of the predicted labels.

table(pred$labels)
## 
##  Astro Ependy Epith1 Epith2  GABA1 GABA10 GABA11 GABA12 GABA13 GABA14 GABA15 
##  11202     75   6264   1443   5676      7      8    100      9     21      8 
## GABA16 GABA17 GABA18  GABA2  GABA3  GABA4  GABA5  GABA6  GABA7  GABA8  GABA9 
##      7    132     22    294   2948   2402   1035    123     56      1      6 
##   Glu1  Glu10  Glu11  Glu12  Glu13  Glu14  Glu15   Glu2   Glu3   Glu4   Glu5 
##     10      6      3     11     21    199     99    144    161  10516   2374 
##   Glu6   Glu7   Glu8   Glu9  Hista    IMO  Macro  Micro     MO    OPC   POPC 
##   1749   8805      1    222    434    218    155   2003   6603   2109      6 
##    SCO   Tany 
##     19     22

5.1 General Groups

Here I’m going to combine some of the groups into larger groups for easy of analysis.

The larger groups I’m going to create are GABA, Glu, and Epith; which combine the many GABA+, Glu+, and Epith labels.

spat$labels_general <- ifelse(grepl('GABA', spat$labels_fine),'GABA',
                              ifelse(grepl('Glu', spat$labels_fine),'Glu',
                                     ifelse(grepl('Epith', spat$labels_fine), 'Epith',spat$labels_fine)))

# table(spat$labels_fine, spat$labels_general)

6 Viewing labels

We can view how the labels look by comparing them to clusters, viewing in UMAP coordinates, and viewing with the spatial context.

6.1 Clusters

We would expect cell type labels to group strongly with clusters.

df <- as.data.frame(table(spat$seurat_clusters, spat$labels_general))
# df

df$Var1.sum <- NA

for (i in unique(df$Var1)){
  df[df$Var1 == i,]$Var1.sum <- sum(df[df$Var1 == i,]$Freq)
}

df$Var2.sum <- NA

for (i in unique(df$Var2)){
  df[df$Var2 == i,]$Var2.sum <- sum(df[df$Var2 == i,]$Freq)
}

df$Var1.perc <- df$Freq / df$Var1.sum
df$Var2.perc <- df$Freq / df$Var2.sum

df$Jaccard <- df$Freq / (df$Var1.sum + df$Var2.sum - df$Freq)

order_ <- c()
cnts_ <- c()
for (i in unique(df$Var1)){
  temp <- df[df$Var1 == i,]
  order_ <- c(order_,which.max(temp$Freq))
  cnts_ <- c(cnts_,max(temp$Freq))
}
temp <- data.frame(row.names = unique(df$Var1),
                   order = order_,
                   cnts = cnts_)
df$Var1_ordered <- factor(df$Var1,
                          levels = rownames(temp[order(temp$order, -temp$cnts),]))

df$`Cluster\nPercentage` <- df$Var1.perc

ggplot(df, aes(x = Var1_ordered, y = Var2, label = Freq, fill = `Cluster\nPercentage`)) +
  geom_tile() +
  xlab('Clusters') + ylab('Cell Labels') +
  geom_text(size = 3) +
  scale_fill_gradient2(low = 'white', mid = "pink", high = 'red',
                       midpoint = .5) +
  ggtitle('Cross-Tabulation of Clusters vs Predicted Cell Labels')

We see that some clusters are made up of primarily one type of cell (ex. cluster 1 is almost exclusively astro), whereas others show a combination of multipe cell types (ex. cluster 16 is made up in large parts by epith, GABA, and Glu).

6.2 UMAP

Below is the general UMAP plot with the labels added.

DimPlot(spat)

DimPlot(spat, group.by = 'labels_general', label = T)

6.2.1 Viewing one cell type at a time

cols_ <- scales::hue_pal()(length(unique(spat$labels_general)))

names(cols_) <- levels(as.factor(spat$labels_general))
for (i in levels(as.factor(spat$labels_general))){
  # print(i)
  temp_cols <- ifelse(names(cols_) == i,cols_[i], 'lightgrey')
  counts_ <- sum(spat$labels_general == i)
  print(DimPlot(spat, group.by = 'labels_general') +
          scale_color_manual(values = temp_cols) +
          ggtitle(paste0(i, ' (n = ',counts_,')')))
}

6.3 Spatial

Now we are going to look at the labels for the cells with their spatial coordinates

ggplot(spat@meta.data, aes(x= center_x, y = -center_y)) +
  geom_point(size = .1) +
  theme_bw() + xlab('') + ylab('')

ggplot(spat@meta.data, aes(x= center_x, y = -center_y, colour = labels_general)) +
  geom_point(size = .1) +
  theme_bw() + xlab('') + ylab('') +
  guides(colour = guide_legend(override.aes= list(size = 4)))

6.3.1 Viewing one cell type at a time

temp <- spat@meta.data
for (i in levels(as.factor(spat$labels_general))){
  # print(i)
  temp_cols <- ifelse(names(cols_) == i,cols_[i], 'lightgrey')
  counts_ <- sum(spat$labels_general == i)
  temp$order <- temp$labels_general == i
  temp <- temp[order(temp$order),]
  temp$size <- ifelse(temp$order, 1,.1)
  print(ggplot(temp, aes(x= center_x, y = -center_y, colour = labels_general,
                         size = size)) +
          geom_point() +
          scale_size_identity() +
          theme_bw() + xlab('') + ylab('') +
          scale_color_manual(values = temp_cols) +
          guides(colour = guide_legend(override.aes= list(size = 4))))
}